DocumentConverter.java example

Explorer

awesome-blogs-android-master
- api
  - src
    - androidTest
      - java
        org
        petabytes
        api
        source
        local
        AwesomeBlogsLocalSourceTest.java
    - main
      - java
        org
        petabytes
        api
        Api.java
        DataSource.java
        source
        local
        AwesomeBlogsLocalSource.java
        Entry.java
        Feed.java
        Migration.java
        Read.java
        remote
        AwesomeBlogsRemoteSource.java
        Entry.java
        Feed.java
        NetworkInterceptor.java
        util
        Dates.java
    - test
      - java
        org
        petabytes
        api
        ApiTest.java
        AwesomeBlogsRemoteSourceTest.java
        util
        DatesTest.java
- app
  - src
    - debug
      - java
        org
        petabytes
        awesomeblogs
        DebugActivityLayoutBinder.java
        DebugAwesomeBlogsApp.java
        DebugCoordinator.java
    - main
      - java
        org
        petabytes
        awesomeblogs
        AwesomeBlogsApp.java
        auth
        Authenticator.java
        SignInActivity.java
        SignInCoordinator.java
        User.java
        author
        AuthorActivity.java
        AuthorCoordinator.java
        base
        AwesomeActivity.java
        Verifiable.java
        chrome
        ActionReceiver.java
        Chromes.java
        digest
        DigestService.java
        Schedulers.java
        StartUpReceiver.java
        fcm
        InstanceIdService.java
        MessagingService.java
        Notifications.java
        feeds
        DrawerCoordinator.java
        EntryCircleCoordinator.java
        EntryDiagonalCoordinator.java
        EntryEntireCoordinator.java
        EntryRowsCoordinator.java
        FeedsActivity.java
        FeedsCoordinator.java
        history
        HistoryActivity.java
        HistoryCoordinator.java
        referrer
        InstallReferrerReceiver.java
        settings
        SettingsActivity.java
        SettingsCoordinator.java
        summary
        FooterCoordinator.java
        SummaryActivity.java
        SummaryCoordinator.java
        util
        Alerts.java
        Analytics.java
        Devices.java
        Intents.java
        LifeCycles.java
        LineHeightSpan.java
        Preferences.java
        Strings.java
        Truss.java
        Views.java
        widget
        CircleView.java
    - test
      - java
        org
        petabytes
        awesomeblogs
        util
        StringsTest.java
- coordinator
  - src
    - main
      - java
        org
        petabytes
        coordinator
        Activity.java
        ActivityGraph.java
        ActivityLayoutBinder.java
        BasePresenter.java
        BaseView.java
        Coordinator.java
        PagerAdapter.java
        PagerFactory.java
        RecyclerAdapter.java
        RecyclerFactory.java
        RxBinder.java
- markDownView
  - src
    - main
      - java
        eu
        fiskur
        markdownview
        MarkdownActivity.java
        MarkdownView.java
        NestedWebView.java
- remark
  - src
    - main
      - java
        com
        overzealous
        remark
        IgnoredHtmlElement.java
        Options.java
        Remark.java
        convert
        Abbr.java
        AbstractNodeHandler.java
        Anchor.java
        BlockQuote.java
        Break.java
        Codeblock.java
        DefaultNodeHandler.java
        Definitions.java
        DocumentConverter.java
        Header.java
        HorizontalRule.java
        Image.java
        InlineCode.java
        InlineStyle.java
        List.java
        NodeHandler.java
        NodeRemover.java
        Paragraph.java
        Table.java
        TextCleaner.java
        package-info.java
        package-info.java
        util
        BlockWriter.java
        MarkdownTable.java
        MarkdownTableCell.java
        StringUtils.java
        package-info.java
    - test
      - java
        com
        overzealous
        remark
        convert
        AnchorInlineTest.java
        AnchorRelativeLinkTest.java
        AnchorSimpleIdTest.java
        BrokenHTMLTest.java
        DocumentConverterTest.java
        GithubTest.java
        IgnoredHTMLTest.java
        InwordEmphasisSpacedTest.java
        MarkdownExtraTest.java
        MultiMarkdownTest.java
        PegdownTest.java
        PlainMarkdownTest.java
        RemarkTester.java
        TableRemoveTest.java
        TextCleanerTest.java
        util
        BlockWriterTest.java
        MarkdownTableTest.java
        StringUtilsTest.java
        TestUtils.java
- slidingMenu
  - src
    - com
      - jeremyfeinstein
        slidingmenu
        lib
        CanvasTransformerBuilder.java
        CustomViewAbove.java
        CustomViewBehind.java
        MenuInterface.java
        SlidingMenu.java
        app
        SlidingActivity.java
        SlidingActivityBase.java
        SlidingActivityHelper.java
        SlidingFragmentActivity.java
        SlidingListActivity.java
        SlidingPreferenceActivity.java

/*
 * Copyright 2011 OverZealous Creations, LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.overzealous.remark.convert;

import com.overzealous.remark.IgnoredHtmlElement;
import com.overzealous.remark.Options;
import com.overzealous.remark.util.BlockWriter;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;

import java.io.OutputStream;
import java.io.Writer;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * The class that does the heavy lifting for converting a JSoup Document into
 * valid Markdown
 *
 * @author Phil DeJarnett
 */
public class DocumentConverter {

	// These properties do not change for the life of this converter
	final Options options;
	final TextCleaner cleaner;
	private final Set<String> ignoredHtmlTags;
	final Map<String,NodeHandler> blockNodes;
	final Map<String,NodeHandler> inlineNodes;

	// These properties change for each conversion
	private Map<String,String> linkUrls; // for looking up links via URL
	private int genericLinkUrlCounter;
	private int genericImageUrlCounter;
	private Map<String,String> linkIds; // an inverse of linkUrls, for looking up links via ID
	private Map<String,String> abbreviations; // a cache of abbreviations mapped by abbreviated form
	BlockWriter output; // the output writer, which may change during recursion

	private Map<String,NodeHandler> lastNodeset;

	private static final Pattern COMMA = Pattern.compile(",");
	private static final Pattern LINK_MULTIPLE_SPACES = Pattern.compile(" {2,}", Pattern.DOTALL);
	private static final Pattern LINK_SAFE_CHARS = Pattern.compile("[^-\\w \\.]+", Pattern.DOTALL);
	private static final String LINK_REPLACEMENT = "_";
	private static final Pattern LINK_EDGE_REPLACE = Pattern.compile(String.format("(^%1$s++)|(%1$s++$)", LINK_REPLACEMENT));
	private static final Pattern LINK_MULTIPLE_REPLACE = Pattern.compile(String.format("%1$s{2,}", LINK_REPLACEMENT));
	private static final Pattern LINK_FILENAME = Pattern.compile("/([^/]++)$");


	/**
	 * Creates a DocumentConverted with the given options.
	 * @param options Options for this converter.
	 */
	public DocumentConverter(Options options) {
		// configure final properties
		this.options = options;
		cleaner = new TextCleaner(options);
		ignoredHtmlTags = new HashSet<String>();
		blockNodes = new HashMap<String, NodeHandler>();
		inlineNodes = new HashMap<String, NodeHandler>();

		// configure ignored tags
		for(final IgnoredHtmlElement ihe : options.getIgnoredHtmlElements()) {
			ignoredHtmlTags.add(ihe.getTagName());
		}
		
		configureNodes();
	}

	private void configureNodes() {
		addInlineNode(new InlineStyle(),	"i,em,b,strong,font,span,del,strike,s");
		addInlineNode(new InlineCode(),		"code,tt");
		addInlineNode(new Image(),			"img");
		addInlineNode(new Anchor(),			"a");
		addInlineNode(new Break(),			"br");
		addBlockNode (new Header(),			"h1,h2,h3,h4,h5,h6");
		addBlockNode (new Paragraph(),		"p");
		addBlockNode (new Codeblock(),		"pre");
		addBlockNode (new BlockQuote(),		"blockquote");
		addBlockNode (new HorizontalRule(),	"hr");
		addBlockNode (new List(),			"ol,ul");

		if(options.abbreviations) {
			addInlineNode(new Abbr(),		"abbr,acronym");
		}

		if(options.definitionLists) {
			addBlockNode(new Definitions(),	"dl");
		}


		// TABLES
		if(options.getTables().isConvertedToText()) {
			// if we are going to process it, add the handler
			addBlockNode(new Table(),		"table");

		} else if(options.getTables().isRemoved()) {
			addBlockNode(NodeRemover.getInstance(), "table");

		} // else, it's being added directly
	}

	@SuppressWarnings({"UnusedDeclaration"})
	public Options getOptions() {
		return options;
	}

	@SuppressWarnings({"UnusedDeclaration"})
	public TextCleaner getCleaner() {
		return cleaner;
	}

	@SuppressWarnings({"UnusedDeclaration"})
	public Map<String, NodeHandler> getBlockNodes() {
		return Collections.unmodifiableMap(blockNodes);
	}

	@SuppressWarnings({"UnusedDeclaration"})
	public Map<String, NodeHandler> getInlineNodes() {
		return Collections.unmodifiableMap(inlineNodes);
	}

	@SuppressWarnings({"UnusedDeclaration"})
	public BlockWriter getOutput() {
		return output;
	}

	@SuppressWarnings({"UnusedDeclaration"})
	public void setOutput(BlockWriter output) {
		this.output = output;
	}

	/**
	 * Customize the processing for a node.  This node is added to the
	 * inline list and the block list.  The inline list is used for nodes
	 * that do not contain linebreaks, such as {@code <em>} or {@code <strong>}.
	 *
	 * The tagnames is a comma-delimited list of tagnames for
	 * which this handler should be applied.
	 *
	 * @param handler The handler for the nodes
	 * @param tagnames One or more tagnames
	 */
	@SuppressWarnings({"WeakerAccess"})
	public void addInlineNode(NodeHandler handler, String tagnames) {
		for(final String key : COMMA.split(tagnames)) {
			if(key.length() > 0) {
				inlineNodes.put(key, handler);
				blockNodes.put(key, handler);
			}
		}
	}


	/**
	 * Customize the processing for a node.  This node is added to the
	 * block list only.  The node handler should properly use the
	 * {@link com.overzealous.remark.util.BlockWriter#startBlock()} and
	 * {@link com.overzealous.remark.util.BlockWriter#endBlock()} methods as
	 * appropriate.
	 *
	 * The tagnames is a comma-delimited list of tagnames for
	 * which this handler should be applied.
	 *
	 * @param handler The handler for the nodes
	 * @param tagnames One or more tagnames
	 */
	@SuppressWarnings({"WeakerAccess"})
	public void addBlockNode(NodeHandler handler, String tagnames) {
		for(final String key : COMMA.split(tagnames)) {
			if(key.length() > 0) {
				blockNodes.put(key, handler);
			}
		}
	}

	/**
	 * Convert a document to the given writer.
	 *
	 * <p><strong>Note: It is up to the calling class to handle closing the writer!</strong></p>
	 *
	 * @param doc Document to convert
	 * @param out Writer to receive the final output
	 */
	public void convert(Document doc, Writer out) {
		this.output = new BlockWriter(out, true);
		this.convertImpl(doc);
	}

	/**
	 * Convert a document to the given output stream.
	 *
	 * <p><strong>Note: It is up to the calling class to handle closing the stream!</strong></p>
	 *
	 * @param doc Document to convert
	 * @param out OutputStream to receive the final output
	 */
	public void convert(Document doc, OutputStream out) {
		this.output = new BlockWriter(out, true);
		this.convertImpl(doc);
	}

	/**
	 * Convert a document and return a string.
	 * When wanting a final string, this method should always be used.
	 * It will attempt to calculate the size of the buffer necessary to hold the entire output.
	 *
	 * @param doc Document to convert
	 * @return The Markdown-formatted string.
	 */
	public String convert(Document doc) {
		// estimate the size necessary to handle the final output
		BlockWriter bw = BlockWriter.create(DocumentConverter.calculateLength(doc, 0));
		this.output = bw;
		this.convertImpl(doc);
		return bw.toString();
	}

	// Utility method to quickly walk the DOM tree and estimate the size of the
	// buffer necessary to hold the result.
	private static int calculateLength(Element el, int depth) {
		int result = 0;
		for(final Node n : el.childNodes()) {
			if(n instanceof Element) {
				result += (4 * depth) + calculateLength((Element)n, depth+1);
			} else if(n instanceof TextNode) {
				result += ((TextNode)n).text().length();
			}
		}
		return result;
	}

	// implementation of the convert method.  Basically handles setting up the
	private void convertImpl(Document doc) {
		
		// linked, because we want the resulting list of links in order they were added
		linkIds = new LinkedHashMap<String, String>();
		// To keep track of already added URLs
		linkUrls = new HashMap<String, String>();
		genericImageUrlCounter = 0;
		genericLinkUrlCounter = 0;
		// linked, to keep abbreviations in the order they were added
		abbreviations = new LinkedHashMap<String, String>();

		lastNodeset = blockNodes;

		// walk the DOM
		walkNodes(DefaultNodeHandler.getInstance(), doc.body(), blockNodes);

		if(!linkIds.isEmpty()) {
			// Add links
			output.startBlock();
			for(final Map.Entry<String,String> link : linkIds.entrySet()) {
				output.printf("\n[%s]: %s", link.getKey(), link.getValue());
			}
			output.endBlock();
		}
		if(!abbreviations.isEmpty()) {
			// Add abbreviations
			output.startBlock();
			for(final Map.Entry<String,String> abbr : abbreviations.entrySet()) {
				output.printf("\n*[%s]: %s", abbr.getKey(), cleaner.clean(abbr.getValue()));
			}
			output.endBlock();
		}

		// free up unused properties
		linkIds = null;
		linkUrls = null;
		abbreviations = null;
		output = null;
	}

	/**
	 * Loops over the children of an HTML Element, handling TextNode and child Elements.
	 *
	 * @param currentNode The default node handler for TextNodes and IgnoredHTMLElements.
	 * @param el The parent HTML Element whose children are being looked at.
	 */
	public void walkNodes(NodeHandler currentNode, Element el) {
		walkNodes(currentNode, el, lastNodeset);
	}

	/**
	 * Loops over the children of an HTML Element, handling TextNode and child Elements.
	 *
	 * @param currentNodeHandler The default node handler for TextNodes and IgnoredHTMLElements.
	 * @param el The parent HTML Element whose children are being looked at.
	 * @param nodeList The list of valid nodes at this level.  Should be one of <b>blockNodes</b> or <b>inlineNodes</b>
	 */
	public void walkNodes(NodeHandler currentNodeHandler, Element el, Map<String, NodeHandler> nodeList) {
		Map<String, NodeHandler> backupLastNodeset = lastNodeset;
		lastNodeset = nodeList;
		for(final Node n : el.childNodes()) {
			if(n instanceof TextNode) {
				// It's just text!
				currentNodeHandler.handleTextNode((TextNode) n, this);

			} else if(n instanceof Element) {
				// figure out who can handle this
				Element node = (Element)n;
				String tagName = node.tagName();

				if(nodeList.containsKey(tagName)) {
					// OK, we know how to handle this node
					nodeList.get(tagName).handleNode(currentNodeHandler, node, this);

				} else if(ignoredHtmlTags.contains(tagName)) {
					// User wants to leave this tag in the output.  Naughty user.
					currentNodeHandler.handleIgnoredHTMLElement(node, this);

				} else {
					// No-can-do, just remove the node, and keep on walkin'
					// The only thing we'll do is add block status in if the unknown node
					// usually renders as a block.
					// Due to BlockWriter's intelligent tracking, we shouldn't get a whole bunch
					// of empty lines for empty nodes.
					if(node.isBlock()) {
						output.startBlock();
					}
					walkNodes(currentNodeHandler, node, nodeList);
					if(node.isBlock()) {
						output.endBlock();
					}					
				}
			} // else: not a node we care about (e.g.: comment nodes)
		}
		lastNodeset = backupLastNodeset;
	}

	/**
	 * Recursively processes child nodes and returns the potential output string.
	 * @param currentNode The default node handler for TextNodes and IgnoredHTMLElements.
	 * @param el The parent HTML Element whose children are being looked at.
	 * @return The potential output string.
	 */
	public String getInlineContent(NodeHandler currentNode, Element el) {
		return this.getInlineContent(currentNode, el, false);
	}

	/**
	 * Recursively processes child nodes and returns the potential output string.
	 * @param currentNode The default node handler for TextNodes and IgnoredHTMLElements.
	 * @param el The parent HTML Element whose children are being looked at.
	 * @param undoLeadingEscapes If true, leading escapes are removed
	 * @return The potential output string.
	 */
	public String getInlineContent(NodeHandler currentNode, Element el, boolean undoLeadingEscapes) {
		BlockWriter oldOutput = output;
		output = BlockWriter.create(1000);
		walkNodes(currentNode, el, inlineNodes);
		String ret = output.toString();
		output = oldOutput;
		if(undoLeadingEscapes) {
			ret = cleaner.unescapeLeadingCharacters(ret);
		}
		return ret;
	}

	/**
	 * Adds a link to the link set, and returns the actual ID for the link.
	 *
	 * @param url URL for link
	 * @param recommendedName A recommended name for non-simple link IDs. This might be modified.
	 * @param image If true, use "img-" instead of "link-" for simple link IDs.
	 * @return The actual link ID for this URL.
	 */
	public String addLink(String url, String recommendedName, boolean image) {
		String linkId;
		if(linkUrls.containsKey(url)) {
			linkId = linkUrls.get(url);
		} else {
			if(options.simpleLinkIds) {
				linkId = (image ? "image-" : "") + String.valueOf(linkUrls.size()+1);
			} else {
				recommendedName = cleanLinkId(url, recommendedName, image);
				if(linkIds.containsKey(recommendedName)) {
					int incr = 1;
					while(linkIds.containsKey(String.format("%s %d", recommendedName, incr))) {
						incr++;
					}
					recommendedName = String.format("%s %d", recommendedName, incr);
				}
				linkId = recommendedName;
			}
			linkUrls.put(url, linkId);
			linkIds.put(linkId, url);
		}
		return linkId;
	}

	/**
	 * Adds an abbreviation to the abbreviation set.
	 * @param abbr The abbreviation to be used
	 * @param definition The definition for the abbreviation, should NOT be pre-escaped.
	 */
	void addAbbreviation(String abbr, String definition) {
		if(!abbreviations.containsKey(abbr)) {
			abbreviations.put(abbr, definition);
		}
	}

	String cleanLinkId(String url, String linkId, boolean image) {
		// no newlines
		String ret = linkId.replace('\n', ' ');
		// multiple spaces should be a single space
		ret = LINK_MULTIPLE_SPACES.matcher(ret).replaceAll(" ");
		// remove all characters except letters, numbers, spaces, and some basic punctuation
		ret = LINK_SAFE_CHARS.matcher(ret).replaceAll(LINK_REPLACEMENT);
		// replace multiple underscores with a single underscore
		ret = LINK_MULTIPLE_REPLACE.matcher(ret).replaceAll(LINK_REPLACEMENT);
		// replace underscores on the left or right with nothing
		ret = LINK_EDGE_REPLACE.matcher(ret).replaceAll("");
		// trim any leading or trailing spaces
		ret = ret.trim();
		if(ret.length() == 0 || ret.equals(LINK_REPLACEMENT)) {
			// if we have nothing usable left, use a generic ID
			if(image) {
				if(url != null) {
					Matcher m = LINK_FILENAME.matcher(url);
					if(m.find()) {
						ret = cleanLinkId(null, m.group(1), true);
					} else {
						genericImageUrlCounter++;
						ret = "Image " + genericImageUrlCounter;
					}
				} else {
					genericImageUrlCounter++;
					ret = "Image " + genericImageUrlCounter;
				}
			} else {
				genericLinkUrlCounter++;
				ret = "Link " + genericLinkUrlCounter;
			}
		} // else, use the cleaned id
		return ret;
	}
}